Handling Missing Values

# import pandas 
import pandas as pd 
# read ufo data 
ufo = pd.read_csv("http://bit.ly/uforeports")
# last 5 rows 
ufo.tail() 
City Colors Reported Shape Reported State Time
18236 Grant Park NaN TRIANGLE IL 12/31/2000 23:00
18237 Spirit Lake NaN DISK IA 12/31/2000 23:00
18238 Eagle River NaN NaN WI 12/31/2000 23:45
18239 Eagle River RED LIGHT WI 12/31/2000 23:45
18240 Ybor NaN OVAL FL 12/31/2000 23:59
# check missing values 
ufo.isnull() 
City Colors Reported Shape Reported State Time
0 False True False False False
1 False True False False False
2 False True False False False
3 False True False False False
4 False True False False False
... ... ... ... ... ...
18236 False True False False False
18237 False True False False False
18238 False True True False False
18239 False False False False False
18240 False True False False False

18241 rows × 5 columns

Note

  1. True: Missing

  2. False: Not Missing

# using notnull() 
ufo.notnull() 
City Colors Reported Shape Reported State Time
0 True False True True True
1 True False True True True
2 True False True True True
3 True False True True True
4 True False True True True
... ... ... ... ... ...
18236 True False True True True
18237 True False True True True
18238 True False False True True
18239 True True True True True
18240 True False True True True

18241 rows × 5 columns

Note

  1. axis = 0: Rows

  2. axis = 1: Columns

# sum of missing values: by default axis = 0
ufo.isnull().sum() 
City                  25
Colors Reported    15359
Shape Reported      2644
State                  0
Time                   0
dtype: int64
# Let's create a series 
pd.Series([True, False, True]).sum() 
2
# filtering using isnull() 
ufo[ufo.City.isnull()]
City Colors Reported Shape Reported State Time
21 NaN NaN NaN LA 8/15/1943 0:00
22 NaN NaN LIGHT LA 8/15/1943 0:00
204 NaN NaN DISK CA 7/15/1952 12:30
241 NaN BLUE DISK MT 7/4/1953 14:00
613 NaN NaN DISK NV 7/1/1960 12:00
1877 NaN YELLOW CIRCLE AZ 8/15/1969 1:00
2013 NaN NaN NaN NH 8/1/1970 9:30
2546 NaN NaN FIREBALL OH 10/25/1973 23:30
3123 NaN RED TRIANGLE WV 11/25/1975 23:00
4736 NaN NaN SPHERE CA 6/23/1982 23:00
5269 NaN NaN NaN AZ 6/30/1985 21:30
6735 NaN NaN FORMATION TX 4/1/1992 2:00
7208 NaN NaN CIRCLE MI 10/4/1993 17:30
8828 NaN NaN TRIANGLE WA 10/30/1995 21:30
8967 NaN NaN VARIOUS CA 12/8/1995 18:00
9273 NaN NaN TRIANGLE OH 5/1/1996 3:00
9388 NaN NaN OVAL CA 6/12/1996 12:00
9587 NaN NaN EGG FL 8/24/1996 15:00
10399 NaN NaN TRIANGLE IL 6/15/1997 23:00
11625 NaN NaN CIRCLE TX 6/7/1998 7:00
12441 NaN RED FIREBALL WA 10/26/1998 17:58
15767 NaN NaN RECTANGLE NV 1/21/2000 11:30
15812 NaN NaN LIGHT NV 2/2/2000 3:00
16054 NaN GREEN NaN FL 3/11/2000 3:30
16608 NaN NaN SPHERE NY 6/15/2000 15:00
# Check specific column
ufo.City.isnull().sum() 
25

Drop Missing Values

# shape 
ufo.shape 
(18241, 5)
# drop missing: drop row contains missing values 
# it is inplace = False 
ufo.dropna(how='any').shape
(2486, 5)
# how=all 
ufo.dropna(how='all').shape 
(18241, 5)
# subset: any 
ufo.dropna(subset=['City', 'Shape Reported'], how='any').shape
(15576, 5)
# subset: all 
ufo.dropna(subset=['City', 'Shape Reported'], how='all').shape
(18237, 5)

Filling Missing Values

# value counts: by default drop = True 
ufo["Shape Reported"].value_counts() 
LIGHT        2803
DISK         2122
TRIANGLE     1889
OTHER        1402
CIRCLE       1365
SPHERE       1054
FIREBALL     1039
OVAL          845
CIGAR         617
FORMATION     434
VARIOUS       333
RECTANGLE     303
CYLINDER      294
CHEVRON       248
DIAMOND       234
EGG           197
FLASH         188
TEARDROP      119
CONE           60
CROSS          36
DELTA           7
ROUND           2
CRESCENT        2
PYRAMID         1
HEXAGON         1
DOME            1
FLARE           1
Name: Shape Reported, dtype: int64
# value counts: false 
ufo["Shape Reported"].value_counts(dropna=False) 
LIGHT        2803
NaN          2644
DISK         2122
TRIANGLE     1889
OTHER        1402
CIRCLE       1365
SPHERE       1054
FIREBALL     1039
OVAL          845
CIGAR         617
FORMATION     434
VARIOUS       333
RECTANGLE     303
CYLINDER      294
CHEVRON       248
DIAMOND       234
EGG           197
FLASH         188
TEARDROP      119
CONE           60
CROSS          36
DELTA           7
CRESCENT        2
ROUND           2
PYRAMID         1
DOME            1
FLARE           1
HEXAGON         1
Name: Shape Reported, dtype: int64
# fillna() 
ufo["Shape Reported"].fillna(value="VARIOUS", inplace=True)
# now take a look 
ufo["Shape Reported"].value_counts() 
VARIOUS      2977
LIGHT        2803
DISK         2122
TRIANGLE     1889
OTHER        1402
CIRCLE       1365
SPHERE       1054
FIREBALL     1039
OVAL          845
CIGAR         617
FORMATION     434
RECTANGLE     303
CYLINDER      294
CHEVRON       248
DIAMOND       234
EGG           197
FLASH         188
TEARDROP      119
CONE           60
CROSS          36
DELTA           7
CRESCENT        2
ROUND           2
PYRAMID         1
HEXAGON         1
DOME            1
FLARE           1
Name: Shape Reported, dtype: int64